from pysbd import Segmenter
from support.utils.toolkit import Log
class TextDoc:
    """Stream a UTF-8 text file as chunks of segmented sentences.

    Physical lines of the file are stripped and concatenated until the
    accumulated text ends with one of the ``LAG_ZH`` terminators; that text
    is then treated as one complete "line".  Complete lines are grouped into
    windows of ``segment_related_num`` lines, each window is handed to a
    sentence segmenter, and the resulting sentences are yielded chunk by
    chunk.  The last line of every window is carried into the next window so
    the segmenter sees it as leading context.
    """

    # Characters that close a complete line when they are its last character.
    LAG_ZH = ["。", "？", "！", "”"]

    def __init__(self, related=5):
        """Create a document reader.

        Args:
            related: Number of complete lines that are joined and segmented
                together as one window.
        """
        self.segment_related_num = related

    def getSentenceBoundaryFrom(self, filepath, segmenter=None):
        """Yield lists of sentences read from ``filepath``.

        Args:
            filepath: Path of the text file to read; it is opened as UTF-8.
            segmenter: Optional object exposing ``segment(text)`` that returns
                a list of sentence strings.  When omitted, a default
                constructed ``Segmenter()`` is used, as before.

        Yields:
            A non-empty list of sentence strings for every window of complete
            lines, followed by one final list for whatever text remains at the
            end of the file (including text that has no closing terminator).
            The line carried over from the previous window is removed from
            the front of a chunk when the segmenter returns it as the first
            sentence, so it is not reported twice.

        Raises:
            OSError: If ``filepath`` cannot be opened.
        """
        if segmenter is None:
            segmenter = Segmenter()
        terminators = tuple(TextDoc.LAG_ZH)

        lines = []      # complete lines of the current window
        line = ""       # text accumulated since the last terminator
        overlap = None  # last line of the previous window, already emitted

        with open(filepath, 'r', encoding='utf-8') as file:
            for raw in file:
                line += raw.strip()
                if not line.endswith(terminators):
                    # Still inside an unfinished line; keep accumulating.
                    continue

                lines.append(line)
                line = ""
                # Only a newly completed line can fill the window, so the
                # size check lives here rather than on every physical line.
                if len(lines) < self.segment_related_num:
                    continue

                sentences = self._segment_window(segmenter, lines, overlap)
                if sentences:
                    yield sentences
                # Carry the last line over as context for the next window.
                overlap = lines[-1]
                lines = [overlap]

        # Text after the last terminator would otherwise be lost.
        if line:
            lines.append(line)

        # When a window was already emitted, lines[0] is the carried-over
        # line; only segment again if something new follows it.
        pending = lines if overlap is None else lines[1:]
        if pending:
            sentences = self._segment_window(segmenter, lines, overlap)
            if sentences:
                yield sentences

    @staticmethod
    def _segment_window(segmenter, lines, overlap):
        """Segment the joined ``lines`` and drop the already emitted overlap."""
        sentences = list(segmenter.segment(' '.join(lines)))
        # Compare stripped text: a segmenter may keep the joining whitespace
        # attached to the sentence, which would defeat an exact comparison.
        if (sentences and overlap is not None
                and sentences[0].strip() == overlap):
            del sentences[0]
        return sentences
